###This file makes the dot maps visualizing the distribution of low income employees across nyc

library(feather) 
library(data.table)
library(rgdal)
library(ggmap)
library(sp)

df_precollapse <- data.table(read_feather("Data/employees_to_map.feather")) 
df_precollapse[, UnderMin:=FALSE]
df_precollapse[(df_precollapse$'BaseSalary.2011' < 8.00 | df_precollapse$'BaseSalary.2012' < 8.00 | df_precollapse$'BaseSalary.2013' < 8.00 | df_precollapse$'BaseSalary.2014' < 8.00 | df_precollapse$'BaseSalary.2015' < 9.00 |  df_precollapse$'BaseSalary.2016' == 9.00), UnderMin:=TRUE]
dim(df_precollapse)
df_precollapse <- df_precollapse[UnderMin==T,]; dim(df_precollapse) 

districts <- sort(table(df_precollapse$ad)) 
distpoints_all <- as.data.frame(tail(districts, 65)); colnames(distpoints_all) <- c("distnames", "pointscountfull")

distpoints_all$pointscount <- round(distpoints_all$pointscountfull / 10)

#start by just plotting the state assembly districts and distributing people across those (pull random points within-district)
#wait, now pulling some from here bc they're clipped to the land: https://data.cityofnewyork.us/City-Government/State-Assembly-Districts/pf5b-73bw
ADs <- readOGR(dsn="Data/NYSassemblydistricts", layer="geo_export_936dae27-4cfe-4cec-b848-e870c875cdd3")
dists <- fortify(ADs) #get shapefiles ready to plot
somedists <- subset(ADs, assem_dist %in% distpoints_all$distnames)
somedists_plot <- fortify(somedists)

#now, draw some random points from within each district (as many as there are voters to plot, or maybe divide by 10 or 100)
numpoints <- sum(distpoints_all$pointscount)
pointstorage <- as.data.frame(matrix(NA, nrow=numpoints, ncol=3)); colnames(pointstorage) <- c("id", "x", "y")
rownum <- 1
for (i in 1:nrow(distpoints_all)){ #loop over districts and pull points. 
	id <- distpoints_all$distnames[i] #set the district 
	pointcount <- distpoints_all$pointscount[i] #how many points?
	newrownum <- rownum + pointcount-1 #for storage
	samp <- spsample(somedists[which(somedists$assem_dist == id), ], n= pointcount, type="random") #sample points
	pointstorage$x[rownum:newrownum] <- samp$x #store it
	pointstorage$y[rownum:newrownum] <- samp$y
	pointstorage$id[rownum:newrownum] <-id
	rownum <- newrownum+1
}

#make base map of NYC
#register_google(key="AIzaSyD5tGVqeqh60ZSimJQrkmc0oNdGY52Y-qg")
base <- get_map(c(left = -74.3, bottom = 40.4548, right =-73.690, top = 40.9233))

#now plot the district shapefiles on top of the base map
pdf("Figures/mergediagnostics_allmatches_assemblydists.pdf")
ggmap(base, extent = "normal", maprange = TRUE)+
 	geom_polygon(data = somedists_plot,
           aes(long, lat, group = group),
           fill = "blue", colour = "dodgerblue4", alpha = 0.05)+
	geom_point(aes(x = x, y = y), data = pointstorage, size=.1) #and add in the scattered points.
dev.off()

#########################################################################################
#now do the same thing, but for only unique matches? (not re-loading any data or basemap)
df_precollapse[, votercount:= nrow(.SD), by=list(idVoters)]
unique <- df_precollapse[votercount==1,]; dim(unique)

districts_unique <- sort(table(unique$ad)) #look at the tail of this-- which districts have more than a handful of people in them? 
distpoints_unique_all <- as.data.frame(tail(districts_unique, 65)); colnames(distpoints_unique_all) <- c("distnames", "pointscountfull")
distpoints_unique_all$pointscount <- round(distpoints_unique_all$pointscountfull / 10)

dists <- fortify(ADs) #get shapefiles ready to plot
somedists_unique <- subset(ADs, assem_dist %in% distpoints_unique_all$distnames)
somedists_unique_plot <- fortify(somedists_unique)

#now, draw some random points from within each district (as many as there are voters to plot, or maybe divide by 10 or 100)
numpoints <- sum(distpoints_unique_all$pointscount)
pointstorage_unique <- as.data.frame(matrix(NA, nrow=numpoints, ncol=3)); colnames(pointstorage_unique) <- c("id", "x", "y")
rownum <- 1
for (i in 1:nrow(distpoints_unique_all)){ #loop over districts and pull points. 
	id <- distpoints_unique_all$distnames[i] #set the district 
	pointcount <- distpoints_unique_all$pointscount[i] #how many points?
	newrownum <- rownum + pointcount-1 #for storage
	samp <- spsample(somedists_unique[which(somedists_unique$assem_dist == id), ], n= pointcount, type="random") #sample points
	pointstorage_unique$x[rownum:newrownum] <- samp$x #store it
	pointstorage_unique$y[rownum:newrownum] <- samp$y
	pointstorage_unique$id[rownum:newrownum] <-id
	rownum <- newrownum+1
}

#now plot the district shapefiles on top of the base map

pdf("Figures/mergediagnostics_uniquematches_assemblydists.pdf")
ggmap(base, extent = "normal", maprange = TRUE)+
 	geom_polygon(data = somedists_unique_plot,
           aes(long, lat, group = group),
           fill = "blue", colour = "dodgerblue4", alpha = 0.05)+
	geom_point(aes(x = x, y = y), data = pointstorage_unique, size=.1) #and add in the scattered points.
dev.off()


